*___________________________________________________________________________________________________________________________________________________________________
*
**# MERGING THE FILES
*___________________________________________________________________________________________________________________________________________________________________

*unicode encoding list			// to see different possible unicode encodings

// Convert each Eurostat .csv extract in the wave 2 folder into its own Stata file
* respectcase keeps the file names exactly as they appear on disk
local csvfiles : dir "$EHIS\EHIS wave 2" files "*.csv", respectcase
foreach csv of local csvfiles {
	display `"`csv'"'
	* Semicolon-delimited, first row holds the variable names, quotes are not treated as binding	// charset(utf8)
	import delimited `"$EHIS\EHIS wave 2\\`csv'"', varnames(1) delimit(";") clear bindquote(nobind)
	/* NOTE: this didn't cause an error when I first ran it, but does ever since... I think it's because I was trying
	   to import Stata files as .csvs by mistake! Reassurance also from reading e.g.
	   https://www.statalist.org/forums/forum/general-stata-discussion/general/1623219-binary-zeros-were-ignored-in-the-source-file */
	* Output name = input name with the "_Anonymisation.csv" suffix stripped
	local stem = subinstr(`"`csv'"', "_Anonymisation.csv", "", .)
	* region is sometimes numeric and sometimes string (in DE) - dropping it avoids append issues later
	capture drop region
	save "$EHIS\EHIS wave 2\\`stem'.dta", replace
}


// Stack the per-country files (those whose name is exactly two characters + .dta) into one dataset
local countryfiles : dir "$EHIS\EHIS wave 2" files "??.dta"
local first = 1
foreach cfile of local countryfiles {
	display `"`cfile'"'
	if `first' {
		* The first file becomes the base dataset in memory
		use `"${EHIS}\EHIS wave 2\\`cfile'"', clear
		local first = 0
	}
	else {
		* Every later file is appended to what is already in memory
		append using `"${EHIS}\EHIS wave 2\\`cfile'"'
	}
	* The per-country file is deleted as soon as it has been read
	erase `"${EHIS}\EHIS wave 2\\`cfile'"'
}
* Variables that are in different formats in the UK vs. EU data: keep the EU versions under new names
rename (age locnace) (agestr locnace_str)
* pa? and pe? are in varied formats, so hard to merge with UK data
drop pa? pe?
save "${EHIS}\EHIS wave 2\0_w2merged.dta", replace
/*
use "${EHIS}\EHIS wave 2\0_w2merged.dta", replace
*/

// Bring in the UK file (properly labelled, as supplied in Stata) and append the merged Eurostat data to it
use "${UK_EHIS}\ehis_wave_2_uk.dta", clear
rename *, lower
drop pa? pe?
generate country = "UK"
* Eurostat = 0 for the UK rows already in memory, 1 for the appended rows
append using "${EHIS}\EHIS wave 2\0_w2merged.dta", generate(Eurostat)

* One bit of labelling is different
* NOTE(review): value 2 is listed twice below ("Lone parent ..." and "Couple with child(ren) ...") - the code numbers
*               look mistyped; confirm the intended codes against the Eurostat codebook before relying on this label
label define hhtype_eurostat																				///
	-1	"missing (dont know, refusal)"																		///
	1	"One-person household"																				///
	2	"Lone parent with child(ren) aged less than 25"														///
	3	"Couple without child(ren) aged less than 25"														///
	2	"Couple with child(ren) aged less than 25"															///
	4	"Couple or lone parent with child(ren) aged less than 25 and other persons living in household"		///
	5	"Other type of household", replace
label values hhtype hhtype_eurostat

* Replace the UK version of locnace with the (destringed) Eurostat version
drop locnace
destring locnace_str, replace
rename locnace_str locnace
label variable locnace " Statistical Classification of Economic Activities in the European Community: Grouped Sectors"

* In UK data categories 3 & 4 are combined for pl2, so need to copy label from another var
label values pl2 PL4

* Best to drop the data from the UK data service - but still valuable to have it here because it is nicely labelled!
keep if Eurostat==1
drop Eurostat age bmi
save "${EHIS}\EHIS wave 2\0_w2raw ${versno}.dta", replace


*___________________________________________________________________________________________________________________________________________________________________
*
**# CLEANING
*___________________________________________________________________________________________________________________________________________________________________
use "${EHIS}\EHIS wave 2\0_w2raw ${versno}.dta", clear


// Things that are different in UK and Eurostat data
* Prefix every existing value label with its numeric code (e.g. "1_...")
numlabel _all, add mask(#_)
* refyear and refmonth are supplied in EU data, unlike at w3
* BMI - don't have UK variable bmi (which is banded version)
label variable bm1		"Height without shoes in cm (not UK)"
label variable bm2		"Weight without clothes in kg (not UK)"

* Age: build a numeric, labelled version of the Eurostat string age band
label define AGE -1 "-1_inconsistent:16-17" -2 "-2_inconsistent:15-17" -3 "-3_inconsistent:15-19" -4 "-4_inconsistent:18-19" 					///
	-11 "-11_inconsistent:75+" -12 "-12_inconsistent:75_79" -13 "-13_inconsistent:80+" -14 "-14_inconsistent:80-84" -15 "-15_inconsistent:85+" 	///
	3 "3_20-24" 4 "4_25-29" 5 "5_30-34" 6"6_35-39" 7"7_40-44" 8"8_45-49"  9"9_50-54" 10"10_55-59" 11"11_60-64" 12"12_65-69" 13"13_70-74", modify
* Don't need to recode age in UK any more - recode age (1=-1)(2=-4)(14=-12)(15=-14)(16=-15)
generate age = .
* Five-year bands from 20-24 upwards take consecutive codes starting at 3
local agecode = 3
foreach band in "20-24" "25-29" "30-34" "35-39" "40-44" "45-49" "50-54" "55-59" "60-64" "65-69" "70-74" {
	replace age = `agecode' if agestr=="`band'"
	local ++agecode
}
* Bands that differ between countries get negative codes
replace age = -2 	if agestr=="15-17"
*replace age = -3 	if agestr=="15-19"
replace age = -4 	if agestr=="18-19"
replace age = -11	if agestr=="75+"
replace age = -12	if agestr=="75-79"
replace age = -13	if agestr=="80+"
replace age = -14	if agestr=="80-84"
replace age = -15	if agestr=="85+"
label values age AGE
drop agestr
* tab country if inrange(age,3,12)		// to check sample size by country
* Analysis sample: age codes 3 to 12 only
keep if inrange(age,3,12)


// General survey things
* Every variable from hatlevel to ic3 (in dataset order) is set to .p unless proxy==1
foreach v of varlist hatlevel-ic3 {
	replace `v' = .p if proxy!=1
}
* Country as categorical (not string) var, placed straight after the string version
encode country, generate(countrynum)
order countrynum, after(country)
* Other vars
* intmethod 11 and 40 occur in several countries - from w2 Quality Report p26 (see also the Annex), this is self-completion online
replace intmethod = 12 if inlist(intmethod, 11, 40)
label define INTMETHOD 12"12_Self-administered, web questionnaire", modify
* See Quality Report p28
replace refyear=2014 if country=="ES"


// Mental health scale (PHQ-8)
// Builds PHQmiss (number of missing items), PHQscore (0-24 total), PHQgroup (5 bands), one dummy per band
// (PHQgroup1-PHQgroup5) and PHQgroupB (moderate/severe vs. minimal/mild) from the mh* items.
// Note: 291 ppl are labelled 'proxy' in IT, contradicting proxy==1 - given the absence of a refusal category in IT, I've assumed these should be refusals (see also pn1 - 243 of these 291 people overlap)
unab phq: mh*
foreach var in `phq'	{
	* Temporary copy scored 0-3, with -3 and -1 treated as refusals
	recode   `var' (-3 -1=.r)(1=0 "0_Not at all")(2=1 "1_Several days")(3=2 "2_More than half the days")(4=3 "3_Nearly every day"), gen(_`var')
	replace _`var' = .n if inlist(country,"ES")			// removed due to confidentiality reasons, at ES request
/**/					}
*tab country mh1a, m row nof
egen PHQmiss 	=  rowmiss(_mh1a-_mh1h)
egen PHQscore 	= rowtotal(_mh1a-_mh1h)
recode PHQscore (0/4 = 1 "1_Minimal depression")(5/9=2 "2_Mild depression")(10/14=3 "3_Moderate depression")(15/19=4 "4_Moderately severe depression") ///
	/**/	(20/24=5 "5_Severe depression"), gen(PHQgroup)
* Partial responses are allowed, but only with at most one missing item
replace PHQgroup = .r if inrange(PHQmiss,2,9)
* rowtotal() counts missing items as zero, so without this line anyone with two or more missing items
* (including those with no valid items at all) would keep a numeric score, inconsistent with PHQgroup
replace PHQscore = .r if inrange(PHQmiss,2,9)
label var PHQmiss			"dv Number of missing items in PHQ scale"
label var PHQscore			"dv Total PHQ score (0-24), inc. partial responses"
label var PHQgroup			"dv PHQ classification, inc. partial responses"
* Individual binary vars for this
tab PHQgroup, gen(PHQgroup)
* Strip the numeric prefixes temporarily so the dummy labels read cleanly; they are added back below
numlabel PHQgroup, mask(#_) remove
forvalues i = 1/5	{
		local thislab: label PHQgroup `i'
		local thislab = subinstr("`thislab'", " depression", "", .)
		label var PHQgroup`i'	"dv PHQ #`i': `thislab' (inc. partial responses)"
/**/				}
recode PHQgroup (1/2=0 "0_Minimal/mild depression")(3/5=1 "1_Moderate/severe depression"), gen(PHQgroupB)
	label var PHQgroupB		"dv PHQ moderate/severe (inc. partial responses)"
numlabel PHQgroup, mask(#_) add
* Final tidying: prefix the item labels, move the derived vars next to the items, drop the temporary copies
unab PHQvars: mh1?
foreach var in `PHQvars'	{
	local thislab: var lab `var'
	label var `var' 	"PHQ: `thislab'"
/**/						}
order PHQmiss-PHQgroup5 PHQgroupB, after(mh1h)
drop _mh*


// Physical limitations
// 		NOTE: apparently at w2, pl1 and pl3 have =3 options - but for confidentiality doesn't seem to be given here
* Hearing routing fix (must run before -2 is recoded to missing below):
* these are people that weren't asked the follow-up hearing q because clear what they would say
replace pl5 = 4 if pl5==-2 & pl4==4
/* checking routing is consistent after adjustment in previous line
	egen hearingmiss = concat(pl3-pl5), punct(" | ")
	drop hearingmiss
*/
*tab country pl7 if pl6==4, m						//  routing is consistent
* Vision (pl1 pl2), hearing (pl3 pl4 pl5) and walking (pl6 pl7): codes -1 and -2 become .r
recode pl1 pl2 pl3 pl4 pl5 pl6 pl7 (-2 -1=.r)


// Pain
* Cross-tab of pn2 by country among those answering 1 at pn1 (row percentages only)
tabulate country pn2 if pn1==1, row nofreq
* Just in FI (and also at w3), most ppl didn't respond to pn2 if they said 'not at all' in pn1.
* In other countries ≈97% who said 'not at all' at pn1 said 'not at all' at pn2, so fill these in as 1.
replace pn2 = 1 if country=="FI" & pn1==1 & pn2==-1
* Remaining -1 and -3 codes become .r
recode pn1 pn2 (-3 -1=.r)


// Condition vars
* For each condition item cd1a to cd1o (cd1p is deliberately left out): create an R version with -1 as .r and 2 as 0,
* carrying over the original variable label with " recoded" appended
foreach letter in a b c d e f g h i j k l m n o {
	local oldlab : variable label cd1`letter'
	recode cd1`letter' (-1=.r)(2=0), generate(cd1`letter'R)
	label variable cd1`letter'R "`oldlab' recoded"
}
* Put the recoded versions where the originals were, then drop the originals
order cd1aR-cd1oR, after(cd1o)
drop cd1a-cd1o


// BMI
// Derives bmigroup (5 bands) from height bm1 (cm) and weight bm2 (kg); the continuous BMI itself is not kept.
/* Looking at min and max values, given different rules about this in different countries
version 14
local var "bm1"
table country if sex==1 & `var'>0, c(count pid count `var' min `var' max `var' )
table country if sex==2 & `var'>0, c(count pid count `var' min `var' max `var' )
local var "bm2"
table country if sex==1 & `var'>0, c(count pid count `var' min `var' max `var' )
table country if sex==2 & `var'>0, c(count pid count `var' min `var' max `var' )
*/
* BMI itself is not supplied - just height and weight, unlike w3. But should be possible to create a version here:
* The revisions below comes from the w2 PDF document from Eurostat - the issue is really Italy does this (and to a lesser extent Ireland)
	recode bm1 bm2 (-1=.r)
	* Same formula for men (sex==1) and women (sex==2); any other value of sex is left missing
	gen 	_bmiR =  bm2 / ((bm1/100)^2) if inlist(sex,1,2)
	replace _bmiR = .p if bm2==.p
	* Banding uses half-open intervals on the cut-points 18.5 / 25 / 30 / 40 so that no value can fall
	* between two bands (the previous ranges, e.g. 0/18.49999 then 18.5/24.9999, left small gaps in which
	* the raw BMI would have been copied through unbanded). As before: system missing becomes .r,
	* .p is kept, and any value matching no band (e.g. a negative) is carried over unchanged.
	gen 	bmigroup = _bmiR
	replace bmigroup = .r if _bmiR==.
	replace bmigroup = 1  if _bmiR>=0    & _bmiR<18.5
	replace bmigroup = 2  if _bmiR>=18.5 & _bmiR<25
	replace bmigroup = 3  if _bmiR>=25   & _bmiR<30
	replace bmigroup = 4  if _bmiR>=30   & _bmiR<40
	replace bmigroup = 5  if _bmiR>=40   & _bmiR<.
	label var bmigroup			"BMI group (dv, see notes)"
	replace bmigroup = .o if country=="IT"					// very strange recoding rules so not really comparable
label define BMI 1	"1_BMI less than 18.5" 	2	"2_BMI 18.5 to less than 25" 3	"3_BMI 25 to less than 30" 4 "4_BMI 30 to less than 40" 5 "5_BMI 40 or more", replace
label values bmigroup BMI
order bmigroup, after(bm2)
drop _bmiR


// LLSI
* Two binary versions of hs3, after setting -1 to .r:
*   severellsiB = 1 only for hs3==1;  llsiB = 1 for hs3==1 or 2
recode hs3 (-1=.r)
recode hs3 (2 3=0)(1=1 "1_severely limited"), generate(severellsiB)
recode hs3 (3=0)(1 2=1 "1_any limitations"), generate(llsiB)
label variable severellsiB 	"dv: LLSI severe limitations (from hs3)"
label variable llsiB 		"dv: LLSI any limitations (from hs3)"
order *llsiB, after(hs3)


// Employment status
* (70=80) not needed in w2 - 70=Compulsory military or civilian service, which is recoded into 'Other' in UK
* for anonymity reasons (and in several countries by the looks of it)
recode mainstat (-1=.r)
* Binary version: code 10 is working, codes 20 through 80 are not working
recode mainstat (20/80=0 "0_not working")(10=1 "1_working"), generate(workB)
label variable workB 		"dv Working (binary var)"
order workB, after(mainstat)


// Education - just doesn't really look comparable


// Weights
label variable wgt				"Weight inc proxy interviews"
/* Checks
label define weightpattern 1 "1_both w8s" 2 "2_non-proxy but missing wgt_spec" 3 "3_non-proxy but missing wgt" 4 "4_missing both" 9 "9_proxy & missing wgt_spec"
gen weightpattern = 1 if !missing(wgt) & !missing(wgt_spec)
	replace weightpattern = 2 if  missing(wgt_spec) & !missing(wgt) 
	replace weightpattern = 3 if !missing(wgt_spec) &  missing(wgt) 
	replace weightpattern = 4 if  missing(wgt_spec) &  missing(wgt) 
label values weightpattern weightpattern 
bysort proxy: tab country weightpattern 
*/
* Declare wgt as the sampling (probability) weight for svy commands
svyset [pweight=wgt]


// Combining rare categories
* PHQ: merge bands 4 and 5
recode PHQgroup (1=1 "1_Minimal depression")(2=2 "2_Mild depression")(3=3 "3_Moderate depression")(4 5=4 "4_Moderately severe or severe depression"), gen(PHQgroupR)
	order PHQgroupR, after(PHQgroup)
* Physical limitation items: merge categories 3 and 4 into a single top category
label define plR 1 "1_No difficulty" 2 "2_Some difficulty" 3 "3_A lot of difficulty/cannot do at all"
	recode pl2 pl4 pl5 pl6 pl7 (3/4=3), gen(pl2R pl4R pl5R pl6R pl7R)
	* The label belongs on the recoded copies only: this previously named pl2 instead of pl2R, which
	* overwrote the value label on the original pl2 and left pl2R unlabelled
	label values pl2R pl4R pl5R pl6R pl7R plR
	foreach i in 2 4 5 6 7	{
		local thislab: var label pl`i'
		label var pl`i'R "`thislab' recoded"
	/**/					}
* Combined walking measure from pl6R and pl7R; later lines override earlier ones, so the order matters
label define pl6and7R 0 "0_no difficulties" 1 "1_some difficulty with either walking 1/2km or steps" 2 "2_some difficulty with both" 3 "3_a lot of difficulty with at least one", replace
	gen 	pl6and7R = 0 if pl6R==1 & pl7R==1
	replace pl6and7R = 1 if inlist(2,pl6R,pl7R)
	replace pl6and7R = 2 if pl6R==2 & pl7R==2
	replace pl6and7R = 3 if inlist(3,pl6R,pl7R)
	label values pl6and7R pl6and7R 
	label var pl6and7R "Difficulty in walking on level ground and up steps combined (from pl6 and pl7)"
order pl2R pl4R pl5R pl6R pl7R pl6and7R, after(pl7)


// Final tidying
order country countrynum age, before(sex)
* Drop the variable blocks ac1a to ac2 and ho1 to ic3 (in dataset order) and the birthplace variables
drop ac1a-ac2 ho1-ic3
drop birthplace*
compress
* Save the cleaned file BEFORE deleting the intermediate files, so that a failed save
* (which stops the do-file) cannot leave the folder with neither the inputs nor the output
save "${EHIS}\EHIS wave 2\0_w2cleaned ${versno}.dta", replace
erase 	"${EHIS}\EHIS wave 2\0_w2raw ${versno}.dta"	
erase 	"${EHIS}\EHIS wave 2\0_w2merged.dta"
/*
	use "${EHIS}\EHIS wave 2\0_w2cleaned ${versno}.dta", replace
*/
